Assignment 4

Richard Pham

11/15/2021

Load Packages

library(tidyverse)
library(forecast)
library(ggcorrplot)
library(reshape2)
library(DT)
library(tidymodels)
library(caret)
library(DAAG)
library(plotly)
library(lemon)
library(cluster.datasets)
library(cluster)
library(factoextra)
library(rpart)
library(rpart.plot)

Load Data

# Read the three data sets used in Parts 1-3 from the working directory:
# DC bike-share daily counts, mall customer demographics, and Titanic passengers.
dcbikeshare <- read_csv("dcbikeshare.csv")
mall_customers <- read_csv("Mall_Customers.csv")
titanic <- read_csv("titanic-1-1.csv")

Part 1

Question 1

# Recode the numeric season codes into a labelled factor whose level
# order is spring (2), summer (3), fall (4), winter (1).
dcbikeshare <- dcbikeshare %>%
  mutate(
    season = fct_relevel(
      fct_recode(
        factor(season),
        spring = "2",
        summer = "3",
        fall   = "4",
        winter = "1"
      ),
      "spring", "summer", "fall", "winter"
    )
  )
dcbikeshare
## # A tibble: 731 x 16
##    instant dteday     season    yr  mnth holiday weekday workingday weathersit
##      <dbl> <date>     <fct>  <dbl> <dbl>   <dbl>   <dbl>      <dbl>      <dbl>
##  1       1 2011-01-01 winter     0     1       0       6          0          2
##  2       2 2011-01-02 winter     0     1       0       0          0          2
##  3       3 2011-01-03 winter     0     1       0       1          1          1
##  4       4 2011-01-04 winter     0     1       0       2          1          1
##  5       5 2011-01-05 winter     0     1       0       3          1          1
##  6       6 2011-01-06 winter     0     1       0       4          1          1
##  7       7 2011-01-07 winter     0     1       0       5          1          2
##  8       8 2011-01-08 winter     0     1       0       6          0          2
##  9       9 2011-01-09 winter     0     1       0       0          0          1
## 10      10 2011-01-10 winter     0     1       0       1          1          1
## # ... with 721 more rows, and 7 more variables: temp <dbl>, atemp <dbl>,
## #   hum <dbl>, windspeed <dbl>, casual <dbl>, registered <dbl>, cnt <dbl>

Question 2

# Convert the 0/1 holiday and workingday flags into "no"/"yes" factors,
# with "no" as the reference level in both cases.
dcbikeshare <- dcbikeshare %>%
  mutate(
    holiday = factor(
      ifelse(holiday == 0, "no", "yes"),
      levels = c("no", "yes")
    ),
    workingday = factor(
      ifelse(workingday == 0, "no", "yes"),
      levels = c("no", "yes")
    )
  )

Question 3

# Recode yr from 0/1 to the actual calendar years, 2011 first.
dcbikeshare <- dcbikeshare %>%
  mutate(
    yr = factor(
      ifelse(yr == 0, "2011", "2012"),
      levels = c("2011", "2012")
    )
  )

Question 4

# Map weathersit codes 1-4 onto descriptive weather labels.
weather_labels <- c(
  "clear",
  "mist",
  "light precipitation",
  "heavy precipitation"
)
dcbikeshare <- dcbikeshare %>%
  mutate(weathersit = factor(weathersit, levels = 1:4, labels = weather_labels))

Question 6

# Simple linear regression: daily rental count as a function of temperature.
cnt_dailytemp <- linear_reg() %>%
  set_engine("lm") %>%
  fit(cnt ~ temp, data = dcbikeshare)
# Coefficient table for the fitted model.
tidy(cnt_dailytemp)
## # A tibble: 2 x 5
##   term        estimate std.error statistic  p.value
##   <chr>          <dbl>     <dbl>     <dbl>    <dbl>
## 1 (Intercept)    1215.      161.      7.54 1.43e-13
## 2 temp           6641.      305.     21.8  2.81e-81
# Model-level fit statistics (R-squared, AIC, etc.).
glance(cnt_dailytemp)
## # A tibble: 1 x 12
##   r.squared adj.r.squared sigma statistic  p.value    df logLik    AIC    BIC
##       <dbl>         <dbl> <dbl>     <dbl>    <dbl> <dbl>  <dbl>  <dbl>  <dbl>
## 1     0.394         0.393 1509.      473. 2.81e-81     1 -6387. 12780. 12793.
## # ... with 3 more variables: deviance <dbl>, df.residual <int>, nobs <int>

The model shows that on a day with a temperature value of 0, we can expect about 1215 bike rentals, on average (the intercept). Furthermore, for each one-unit increase in the temperature variable, expected rentals increase by approximately 6641 (the slope estimate). Note that 161 is the standard error of the intercept, not a per-degree increase in rentals.

Question 7

# Multiple regression on all main effects plus a temp-by-holiday interaction.
cnt_all <- linear_reg() %>%
  set_engine("lm") %>%
  fit(
    cnt ~ season + yr + holiday + workingday + weathersit +
      temp + atemp + hum + windspeed + temp * holiday,
    data = dcbikeshare
  )
# Coefficient table for the full model.
tidy(cnt_all)
## # A tibble: 14 x 5
##    term                          estimate std.error statistic   p.value
##    <chr>                            <dbl>     <dbl>     <dbl>     <dbl>
##  1 (Intercept)                      2717.     268.     10.1   1.36e- 22
##  2 seasonsummer                     -277.     100.     -2.76  5.98e-  3
##  3 seasonfall                        410.      96.1     4.27  2.24e-  5
##  4 seasonwinter                    -1131.     113.     -9.96  5.50e- 22
##  5 yr2012                           2014.      61.7    32.7   5.12e-144
##  6 holidayyes                      -1397.     471.     -2.97  3.11e-  3
##  7 workingdayyes                     120.      67.8     1.77  7.79e-  2
##  8 weathersitmist                   -420.      81.3    -5.18  2.96e-  7
##  9 weathersitlight precipitation   -1908.     207.     -9.19  4.03e- 19
## 10 temp                             4210.    1394.      3.02  2.62e-  3
## 11 atemp                             947.    1522.      0.622 5.34e-  1
## 12 hum                             -1359.     296.     -4.60  5.07e-  6
## 13 windspeed                       -2722.     435.     -6.26  6.54e- 10
## 14 holidayyes:temp                  1669.     926.      1.80  7.20e-  2
# Adjusted R-squared of the full model.
cnt_all %>% glance() %>% pull(adj.r.squared)
## [1] 0.820292

The adjusted R-squared value for the model is 0.82.

Part 2

Question 1

# Scatter plot of annual income against age, coloured by sex, with a
# loess trend per sex.
# FIX: the title previously read "Annual Income vs Sex", but the axes
# plot income against Age (Sex is only the colour aesthetic).
mall_customers %>%
  ggplot(aes(x = Age, y = `Annual Income (k$)`, color = Sex)) +
  geom_point(na.rm = TRUE) +
  geom_smooth(method = "loess") +
  labs(title = "Annual Income vs Age") +
  theme_minimal()
## `geom_smooth()` using formula 'y ~ x'

As shown in the scatter plot above, from the age range of 20-40, male annual income is greater than female annual income before they begin trending closer together past the age of 40.

# Scatter plot of spending score against age, coloured by sex, with a
# loess trend per sex.
ggplot(mall_customers, aes(x = Age, y = `Spending Score (1-100)`, color = Sex)) +
  geom_point(na.rm = TRUE) +
  geom_smooth(method = "loess") +
  labs(title = "Spending Score vs Age") +
  theme_minimal()
## `geom_smooth()` using formula 'y ~ x'

From this scatter plot, it can be observed that Spending Score is high in both genders at the younger age range of 20-40, before it trends lower past age 40.

# Histogram of spending score, faceted by sex with repeated axis labels,
# rendered interactively with plotly.
mall_customers_histogram <- mall_customers %>%
  ggplot(aes(x = `Spending Score (1-100)`, fill = Sex)) +
  geom_histogram(bins = 50) +
  facet_rep_wrap(~Sex, repeat.tick.labels = TRUE) +
  labs(title = "Count of Spending Score of Both Genders", y = "Frequency") +
  theme_minimal()

ggplotly(mall_customers_histogram)

For females, the most observed Spending Score was 42. For males, the most observed Spending Score was 46.

Question 2

set.seed(1)
# K-means with 5 clusters on annual income and spending score.
# FIX: select the clustering features by name rather than by column
# position (previously mall_customers[, 4:5]), so the code does not
# silently break if the CSV's column order changes.
mall_customers_cluster <- kmeans(
  mall_customers %>% select(`Annual Income (k$)`, `Spending Score (1-100)`),
  centers = 5,
  nstart = 10
)
mall_customers_cluster
## K-means clustering with 5 clusters of sizes 22, 35, 81, 39, 23
## 
## Cluster means:
##   Annual Income (k$) Spending Score (1-100)
## 1           25.72727               79.36364
## 2           88.20000               17.11429
## 3           55.29630               49.51852
## 4           86.53846               82.12821
## 5           26.30435               20.91304
## 
## Clustering vector:
##   [1] 5 1 5 1 5 1 5 1 5 1 5 1 5 1 5 1 5 1 5 1 5 1 5 1 5 1 5 1 5 1 5 1 5 1 5 1 5
##  [38] 1 5 1 5 1 5 3 5 1 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
##  [75] 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
## [112] 3 3 3 3 3 3 3 3 3 3 3 3 4 2 4 3 4 2 4 2 4 3 4 2 4 2 4 2 4 2 4 3 4 2 4 2 4
## [149] 2 4 2 4 2 4 2 4 2 4 2 4 2 4 2 4 2 4 2 4 2 4 2 4 2 4 2 4 2 4 2 4 2 4 2 4 2
## [186] 4 2 4 2 4 2 4 2 4 2 4 2 4 2 4
## 
## Within cluster sum of squares by cluster:
## [1]  3519.455 12511.143  9875.111 13444.051  5098.696
##  (between_SS / total_SS =  83.5 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
## [6] "betweenss"    "size"         "iter"         "ifault"

These are the initial clusters, computed from Annual Income and Spending Score.

set.seed(1)
options(scipen = 999)
# Elbow plot: total within-cluster sum of squares across candidate k,
# used to choose the number of clusters.
fviz_nbclust(mall_customers[, 4:5], kmeans, method = "wss") +
  theme_minimal()

Elbow plots like the one above show the ideal number of clusters to be used in a data set by looking at where the elbow curves. In this case, I used 5 for the clusters prior. My recommendation for the correct amount of clusters to be used for this data set is 6.

Question 3

# Attach each customer's cluster assignment as a character column.
mall_customers_cluster_df5 <- mall_customers %>%
  mutate(cluster = as.character(mall_customers_cluster$cluster))

# Plot customers coloured by cluster, with cluster centroids in red.
# FIX: use bare column names inside aes() instead of
# mall_customers_cluster_df5$`...` -- mixing df$col with a data argument
# is a ggplot2 anti-pattern that breaks faceting and grouping.
ggplot() +
  geom_point(
    data = mall_customers_cluster_df5,
    mapping = aes(
      x = `Annual Income (k$)`,
      y = `Spending Score (1-100)`,
      color = cluster
    )
  ) +
  geom_point(
    mapping = aes(
      x = mall_customers_cluster$centers[, 1],
      y = mall_customers_cluster$centers[, 2]
    ),
    color = "red",
    size = 3
  ) +
  scale_color_discrete(
    name = " ",
    breaks = c("1", "2", "3", "4", "5"),
    labels = c("Spendthrift", "Frugal", "General", "Target", "Careful")
  ) +
  labs(
    title = "K-means Clustering of Mall Customers",
    x = "Annual Income (k$)",
    y = "Spending Score"
  )

The clustering shows that there are 5 distinct groups of mall customers. This new insight would tell me to start marketing towards the frugal and careful groups. Also, I would continue to market towards the Target group as they are the larger spenders.

Part 3

Question 1

# Normal QQ-plot of passenger age, using the mean and standard deviation
# of the non-missing ages as the reference distribution's parameters.
age_params <- titanic %>%
  filter(!is.na(Age)) %>%
  summarise(mean = mean(Age), sd = sd(Age))

titanic %>%
  ggplot(aes(sample = Age)) +
  geom_qq(dparams = age_params) +
  geom_abline() +
  theme_minimal()
## Warning: Removed 177 rows containing non-finite values (stat_qq).

Using the parameters above, I filtered out rows with missing Age, computed the mean and standard deviation of the remaining ages, and used geom_qq and geom_abline to create a QQ-plot of the passengers' age distribution.

Question 2

# Recode Survived as a labelled factor ("Did Not Survive" first) and
# drop identifier columns that carry no predictive signal.
titanic <- titanic %>%
  mutate(
    Survived = factor(
      ifelse(Survived == 0, "Did Not Survive", "Survived"),
      levels = c("Did Not Survive", "Survived")
    )
  ) %>%
  select(-Name, -PassengerId, -Ticket)

set.seed(1)
# 60/40 train/validation split of the Titanic data.
# FIX: named titanic_split instead of "split" so the object does not
# mask base::split(); training_data / validation_data names unchanged.
titanic_split <- initial_split(titanic, prop = 0.6)
training_data <- training(titanic_split)
validation_data <- testing(titanic_split)

set.seed(1)
# Gini-based classification tree predicting survival from the remaining
# passenger attributes.
passenger_tree <- rpart(
  Survived ~ Age + Pclass + Sex + SibSp + Parch + Fare + Cabin + Embarked,
  data = training_data,
  method = "class",
  parms = list(split = "gini")
)

# Draw the fitted tree with full factor/variable names.
prp(passenger_tree, faclen = 0, varlen = 0, cex = 0.75, yesno = 2)

# Complexity-parameter table for pruning diagnostics.
printcp(passenger_tree)
## 
## Classification tree:
## rpart(formula = Survived ~ Age + Pclass + Sex + SibSp + Parch + 
##     Fare + Cabin + Embarked, data = training_data, method = "class", 
##     parms = list(split = "gini"))
## 
## Variables actually used in tree construction:
## [1] Age    Fare   Pclass Sex    SibSp 
## 
## Root node error: 206/534 = 0.38577
## 
## n= 534 
## 
##         CP nsplit rel error  xerror     xstd
## 1 0.475728      0   1.00000 1.00000 0.054605
## 2 0.029126      1   0.52427 0.52427 0.045059
## 3 0.024272      3   0.46602 0.54369 0.045670
## 4 0.019417      6   0.39320 0.52427 0.045059
## 5 0.010000      8   0.35437 0.49515 0.044096

Question 3

# Predicted survival classes for the training and validation sets.
prediction_test <- predict(passenger_tree, newdata = training_data, type = "class") 
prediction_test2 <- predict(passenger_tree, newdata = validation_data, type = "class")

# Store the training-set predictions alongside the observations.
training_data$predicted <- prediction_test 

options(scipen = 999)

# NOTE(review): this confusion matrix is computed on the TRAINING data,
# so the reported accuracy is optimistic. prediction_test2 (the
# validation-set predictions) is never evaluated -- consider
# confusionMatrix(prediction_test2, as.factor(validation_data$Survived))
# for an honest estimate of out-of-sample performance.
confusionMatrix(prediction_test, as.factor(training_data$Survived)) 
## Confusion Matrix and Statistics
## 
##                  Reference
## Prediction        Did Not Survive Survived
##   Did Not Survive             312       57
##   Survived                     16      149
##                                                
##                Accuracy : 0.8633               
##                  95% CI : (0.8312, 0.8913)     
##     No Information Rate : 0.6142               
##     P-Value [Acc > NIR] : < 0.00000000000000022
##                                                
##                   Kappa : 0.7004               
##                                                
##  Mcnemar's Test P-Value : 0.000002846          
##                                                
##             Sensitivity : 0.9512               
##             Specificity : 0.7233               
##          Pos Pred Value : 0.8455               
##          Neg Pred Value : 0.9030               
##              Prevalence : 0.6142               
##          Detection Rate : 0.5843               
##    Detection Prevalence : 0.6910               
##       Balanced Accuracy : 0.8373               
##                                                
##        'Positive' Class : Did Not Survive      
## 

The confusion matrix shows that the classifier is accurate most of the time. The model is significant as the accuracy is higher than the no-information rate. Since the positive class is "Did Not Survive", the high sensitivity (0.95) shows the model performed well at predicting who actually died, while the lower specificity (0.72) shows it was weaker at identifying survivors.